Por que R?

  • Colaboração, automação e reprodutibilidade

  • Grátis, código aberto, e disponível para várias plataformas

  • Comunidade grande

  • Número enorme de pacotes para estatística, manipulação de dados, visualização, genômica, etc.

  • Permite pensar mais sobre suas análises

  • Funciona para um continuum de expertises: de usuário a programador

Como começar

Instalar o R

IDE

RStudio

RStudio

IDE

Neovim + nvim-R

Neovim + nvim-R

Dicas

  • Nunca modifique seu dado original

  • Guarde apenas seu dado de entrada, dado de resultado, e o código para ir de um para o outro

OK, mas o que tem dentro do R?

sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin18.6.0 (64-bit)
Running under: macOS Mojave 10.14.6

Matrix products: default
BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.7/lib/libopenblasp-r0.3.7.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.4.0
 [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_1.16 
 [9] highr_0.8       knitr_1.25      stringr_1.4.0   xfun_0.10      
[13] digest_0.6.21   rlang_0.4.0     evaluate_0.14  

help(package = "base")

Funções

  • sum
  • log
  • exp
  • mean
  • median
  • var
  • sd
  • sqrt
  • lm
  • apply

?Syntax

Objetos

# Bind values to names with `<-`: a number, a character string and a logical.
x <- 10
a <- "banana"
# NOTE(review): `t` is also the name of base R's matrix-transpose function;
# a more descriptive name would avoid confusion.
t <- TRUE

x + 1
[1] 11
x ^ 2
[1] 100

numbers <- 1:10
numbers + 1
 [1]  2  3  4  5  6  7  8  9 10 11
numbers ^ 2
 [1]   1   4   9  16  25  36  49  64  81 100
mean(numbers)
[1] 5.5

Classes de objetos

vetor

# integer: the `:` operator builds a sequence of whole numbers stored as integers
inteiros <- 1:5 

# double: numeric literals written without the `L` suffix are stored as doubles
numeros <- c(1, 2, 3, 4, 5) 

# character: text values, written between quotes
frutas <- c("banana", "maçã", "laranja") 

# logical: TRUE / FALSE values
resultados <- c(TRUE, FALSE, FALSE, TRUE) 

lista

# A list can bundle vectors of different types and lengths; each element is
# stored under the name given on the left-hand side of `=`.
lista <- list(
  inteiros = inteiros,
  frutas = frutas,
  resultados = resultados
)

lista
$inteiros
[1] 1 2 3 4 5

$frutas
[1] "banana"  "maçã"    "laranja"

$resultados
[1]  TRUE FALSE FALSE  TRUE

data.frame

# A data.frame is a table whose columns are equal-length vectors, each with
# its own type (here: integer, character and logical).
# `stringsAsFactors = FALSE` keeps `fruta` as character on every R version:
# before R 4.0 (this session runs 3.6.1) the default silently converted
# character columns to factors.
d <- data.frame(
  inteiro = 1:3,
  fruta = c("banana", "maçã", "laranja"),
  resultado = c(TRUE, FALSE, FALSE),
  stringsAsFactors = FALSE
)

d
  inteiro   fruta resultado
1       1  banana      TRUE
2       2    maçã     FALSE
3       3 laranja     FALSE

tidyverse

  • importar dados -> tidy -> transformar -> explorar -> visualizar

Gramática de manipulação de dados

  • mutate
  • select
  • filter
  • summarise
  • arrange
  • pivot
  • nest

library(tidyverse)
── Attaching packages ─────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

tidy data

  • Cada coluna é uma variável

  • Cada linha é uma observação

  • Cada célula é um valor

Pipe

%>%

pew <- read_csv("./data/pew.csv")
Parsed with column specification:
cols(
  religion = col_character(),
  `<$10k` = col_double(),
  `$10-20k` = col_double(),
  `$20-30k` = col_double(),
  `$30-40k` = col_double(),
  `$40-50k` = col_double(),
  `$50-75k` = col_double(),
  `$75-100k` = col_double(),
  `$100-150k` = col_double(),
  `>150k` = col_double(),
  `Don't know/refused` = col_double()
)

pew
# A tibble: 18 x 11
   religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k` `$50-75k` `$75-100k`
   <chr>      <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>      <dbl>
 1 Agnostic      27        34        60        81        76       137        122
 2 Atheist       12        27        37        52        35        70         73
 3 Buddhist      27        21        30        34        33        58         62
 4 Catholic     418       617       732       670       638      1116        949
 5 Don’t k…      15        14        15        11        10        35         21
 6 Evangel…     575       869      1064       982       881      1486        949
 7 Hindu          1         9         7         9        11        34         47
 8 Histori…     228       244       236       238       197       223        131
 9 Jehovah…      20        27        24        24        21        30         15
10 Jewish        19        19        25        25        30        95         69
# … with 8 more rows, and 3 more variables: `$100-150k` <dbl>, `>150k` <dbl>,
#   `Don't know/refused` <dbl>

# Reshape from wide to long: every column except `religion` is stacked, with
# the former column name stored in `income` and the cell value in `count`.
pew %>%
  pivot_longer(
    cols = -religion,
    names_to = "income",
    values_to = "count"
  )
# A tibble: 180 x 3
   religion income             count
   <chr>    <chr>              <dbl>
 1 Agnostic <$10k                 27
 2 Agnostic $10-20k               34
 3 Agnostic $20-30k               60
 4 Agnostic $30-40k               81
 5 Agnostic $40-50k               76
 6 Agnostic $50-75k              137
 7 Agnostic $75-100k             122
 8 Agnostic $100-150k            109
 9 Agnostic >150k                 84
10 Agnostic Don't know/refused    96
# … with 170 more rows

who
# A tibble: 7,240 x 60
   country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
   <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
 1 Afghan… AF    AFG    1980          NA           NA           NA           NA
 2 Afghan… AF    AFG    1981          NA           NA           NA           NA
 3 Afghan… AF    AFG    1982          NA           NA           NA           NA
 4 Afghan… AF    AFG    1983          NA           NA           NA           NA
 5 Afghan… AF    AFG    1984          NA           NA           NA           NA
 6 Afghan… AF    AFG    1985          NA           NA           NA           NA
 7 Afghan… AF    AFG    1986          NA           NA           NA           NA
 8 Afghan… AF    AFG    1987          NA           NA           NA           NA
 9 Afghan… AF    AFG    1988          NA           NA           NA           NA
10 Afghan… AF    AFG    1989          NA           NA           NA           NA
# … with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
#   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
#   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
#   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
#   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
#   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
#   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
#   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
#   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
#   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
#   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
#   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
#   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
#   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
#   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
#   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
#   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
#   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

# Keep the identifier columns (`country` through `year`) and stack all the
# remaining columns into name/value pairs (`info` / `count`).
who %>%
  pivot_longer(
    cols = -(country:year),
    names_to = "info",
    values_to = "count"
  )
# A tibble: 405,440 x 6
   country     iso2  iso3   year info         count
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1980 new_sp_m014     NA
 2 Afghanistan AF    AFG    1980 new_sp_m1524    NA
 3 Afghanistan AF    AFG    1980 new_sp_m2534    NA
 4 Afghanistan AF    AFG    1980 new_sp_m3544    NA
 5 Afghanistan AF    AFG    1980 new_sp_m4554    NA
 6 Afghanistan AF    AFG    1980 new_sp_m5564    NA
 7 Afghanistan AF    AFG    1980 new_sp_m65      NA
 8 Afghanistan AF    AFG    1980 new_sp_f014     NA
 9 Afghanistan AF    AFG    1980 new_sp_f1524    NA
10 Afghanistan AF    AFG    1980 new_sp_f2534    NA
# … with 405,430 more rows

who %>%
  pivot_longer(
    cols = -(country:year),
    names_to = "info",
    values_to = "count"
  ) %>%
  # Split codes such as "new_sp_m014" into three columns, one per regex
  # capture group; the `_?` also accepts the "newrel_..." spelling.
  extract(
    col = info,
    into = c("diagnosis", "gender", "age"),
    regex = "new_?(.*)_(.)(.+)"
  )
# A tibble: 405,440 x 8
   country     iso2  iso3   year diagnosis gender age   count
   <chr>       <chr> <chr> <int> <chr>     <chr>  <chr> <int>
 1 Afghanistan AF    AFG    1980 sp        m      014      NA
 2 Afghanistan AF    AFG    1980 sp        m      1524     NA
 3 Afghanistan AF    AFG    1980 sp        m      2534     NA
 4 Afghanistan AF    AFG    1980 sp        m      3544     NA
 5 Afghanistan AF    AFG    1980 sp        m      4554     NA
 6 Afghanistan AF    AFG    1980 sp        m      5564     NA
 7 Afghanistan AF    AFG    1980 sp        m      65       NA
 8 Afghanistan AF    AFG    1980 sp        f      014      NA
 9 Afghanistan AF    AFG    1980 sp        f      1524     NA
10 Afghanistan AF    AFG    1980 sp        f      2534     NA
# … with 405,430 more rows

# Long-format WHO table: one row per country / year / diagnosis / gender / age.
tidy_who <- who %>%
  pivot_longer(
    cols = -(country:year),
    names_to = "info",
    values_to = "count"
  ) %>%
  extract(
    col = info,
    into = c("diagnosis", "gender", "age"),
    regex = "new_?(.*)_(.)(.+)"
  )

# Subset with the Brazilian records only.
who_bra <- filter(tidy_who, country == "Brazil")
who_bra
# A tibble: 1,904 x 8
   country iso2  iso3   year diagnosis gender age   count
   <chr>   <chr> <chr> <int> <chr>     <chr>  <chr> <int>
 1 Brazil  BR    BRA    1980 sp        m      014      NA
 2 Brazil  BR    BRA    1980 sp        m      1524     NA
 3 Brazil  BR    BRA    1980 sp        m      2534     NA
 4 Brazil  BR    BRA    1980 sp        m      3544     NA
 5 Brazil  BR    BRA    1980 sp        m      4554     NA
 6 Brazil  BR    BRA    1980 sp        m      5564     NA
 7 Brazil  BR    BRA    1980 sp        m      65       NA
 8 Brazil  BR    BRA    1980 sp        f      014      NA
 9 Brazil  BR    BRA    1980 sp        f      1524     NA
10 Brazil  BR    BRA    1980 sp        f      2534     NA
# … with 1,894 more rows

# Drop the rows where no count was reported.
filter(who_bra, !is.na(count))
# A tibble: 406 x 8
   country iso2  iso3   year diagnosis gender age   count
   <chr>   <chr> <chr> <int> <chr>     <chr>  <chr> <int>
 1 Brazil  BR    BRA    1999 sp        m      014     301
 2 Brazil  BR    BRA    1999 sp        m      1524   3662
 3 Brazil  BR    BRA    1999 sp        m      2534   5401
 4 Brazil  BR    BRA    1999 sp        m      3544   5827
 5 Brazil  BR    BRA    1999 sp        m      4554   4630
 6 Brazil  BR    BRA    1999 sp        m      5564   2634
 7 Brazil  BR    BRA    1999 sp        m      65     2121
 8 Brazil  BR    BRA    1999 sp        f      014     372
 9 Brazil  BR    BRA    1999 sp        f      1524   2909
10 Brazil  BR    BRA    1999 sp        f      2534   3450
# … with 396 more rows

# Total cases per year and diagnosis, ignoring unreported counts.
who_bra %>%
  filter(!is.na(count)) %>%
  group_by(year, diagnosis) %>%
  # summarise() peels off the last grouping level, so the result is still
  # grouped by `year`.
  summarise(count = sum(count))
# A tibble: 29 x 3
# Groups:   year [15]
    year diagnosis count
   <int> <chr>     <int>
 1  1999 sp        37737
 2  2000 sp        80488
 3  2001 sp        37491
 4  2002 sp        40723
 5  2003 sp        39883
 6  2004 sp        42881
 7  2005 sp        42093
 8  2006 ep        10656
 9  2006 sn        22585
10  2006 sp        41117
# … with 19 more rows

Arquivo VCF

vcf <- read_tsv("./data/chr22.vcf", comment = "##")

vcf
# A tibble: 999 x 2,513
   `#CHROM`    POS ID    REF   ALT    QUAL FILTER INFO  FORMAT HG00096 HG00097
      <dbl>  <dbl> <chr> <chr> <chr> <dbl> <chr>  <chr> <chr>  <chr>   <chr>  
 1       22 1.61e7 rs58… A     G       100 PASS   AC=1… GT     0|0     0|0    
 2       22 1.61e7 rs58… G     A       100 PASS   AC=3… GT     0|0     0|0    
 3       22 1.61e7 rs58… C     T       100 PASS   AC=3… GT     0|0     0|0    
 4       22 1.61e7 rs58… C     T       100 PASS   AC=1… GT     0|0     0|0    
 5       22 1.61e7 rs58… C     A       100 PASS   AC=1… GT     0|0     0|0    
 6       22 1.61e7 rs58… C     A       100 PASS   AC=2… GT     0|0     0|0    
 7       22 1.61e7 rs58… G     A       100 PASS   AC=5… GT     0|0     0|0    
 8       22 1.61e7 rs58… G     T       100 PASS   AC=2… GT     0|0     0|0    
 9       22 1.61e7 rs58… G     T       100 PASS   AC=1… GT     0|0     0|0    
10       22 1.61e7 esv3… A     <CN0…   100 PASS   AC=9… GT     3|0     0|0    
# … with 989 more rows, and 2,502 more variables: HG00099 <chr>, HG00100 <chr>,
#   HG00101 <chr>, HG00102 <chr>, HG00103 <chr>, HG00105 <chr>, HG00106 <chr>,
#   HG00107 <chr>, HG00108 <chr>, HG00109 <chr>, HG00110 <chr>, HG00111 <chr>,
#   HG00112 <chr>, HG00113 <chr>, HG00114 <chr>, HG00115 <chr>, HG00116 <chr>,
#   HG00117 <chr>, HG00118 <chr>, HG00119 <chr>, HG00120 <chr>, HG00121 <chr>,
#   HG00122 <chr>, HG00123 <chr>, HG00125 <chr>, HG00126 <chr>, HG00127 <chr>,
#   HG00128 <chr>, HG00129 <chr>, HG00130 <chr>, HG00131 <chr>, HG00132 <chr>,
#   HG00133 <chr>, HG00136 <chr>, HG00137 <chr>, HG00138 <chr>, HG00139 <chr>,
#   HG00140 <chr>, HG00141 <chr>, HG00142 <chr>, HG00143 <chr>, HG00145 <chr>,
#   HG00146 <chr>, HG00148 <chr>, HG00149 <chr>, HG00150 <chr>, HG00151 <chr>,
#   HG00154 <chr>, HG00155 <chr>, HG00157 <chr>, HG00158 <chr>, HG00159 <chr>,
#   HG00160 <chr>, HG00171 <chr>, HG00173 <chr>, HG00174 <chr>, HG00176 <chr>,
#   HG00177 <chr>, HG00178 <chr>, HG00179 <chr>, HG00180 <chr>, HG00181 <chr>,
#   HG00182 <chr>, HG00183 <chr>, HG00185 <chr>, HG00186 <chr>, HG00187 <chr>,
#   HG00188 <chr>, HG00189 <chr>, HG00190 <chr>, HG00231 <chr>, HG00232 <chr>,
#   HG00233 <chr>, HG00234 <chr>, HG00235 <chr>, HG00236 <chr>, HG00237 <chr>,
#   HG00238 <chr>, HG00239 <chr>, HG00240 <chr>, HG00242 <chr>, HG00243 <chr>,
#   HG00244 <chr>, HG00245 <chr>, HG00246 <chr>, HG00250 <chr>, HG00251 <chr>,
#   HG00252 <chr>, HG00253 <chr>, HG00254 <chr>, HG00255 <chr>, HG00256 <chr>,
#   HG00257 <chr>, HG00258 <chr>, HG00259 <chr>, HG00260 <chr>, HG00261 <chr>,
#   HG00262 <chr>, HG00263 <chr>, HG00264 <chr>, …

vcf %>% select(INFO) 
# A tibble: 999 x 1
   INFO                                                                         
   <chr>                                                                        
 1 AC=1;AF=0.000199681;AN=5008;NS=2504;DP=8012;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_A…
 2 AC=32;AF=0.00638978;AN=5008;NS=2504;DP=11468;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0…
 3 AC=38;AF=0.00758786;AN=5008;NS=2504;DP=15092;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0…
 4 AC=1;AF=0.000199681;AN=5008;NS=2504;DP=22609;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0…
 5 AC=1;AF=0.000199681;AN=5008;NS=2504;DP=23591;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_…
 6 AC=2;AF=0.000399361;AN=5008;NS=2504;DP=21258;EAS_AF=0.002;AMR_AF=0;AFR_AF=0;…
 7 AC=5;AF=0.000998403;AN=5008;NS=2504;DP=20274;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0…
 8 AC=2;AF=0.000399361;AN=5008;NS=2504;DP=21022;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0…
 9 AC=1;AF=0.000199681;AN=5008;NS=2504;DP=22073;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_…
10 AC=9,87,599,20;AF=0.00179712,0.0173722,0.119609,0.00399361;AN=5008;CS=DUP_gs…
# … with 989 more rows

# Pull the variant type out of INFO into a `var_type` column. INFO is a
# ";"-separated list of fields, so the capture group must stop at the next
# ";": a greedy `.+` would also swallow every field that follows `VT=`
# (e.g. "SNP;MULTI_ALLELIC").
vcf %>%
  extract(INFO, "var_type", "VT=([^;]+)")
# A tibble: 999 x 2,513
   `#CHROM`    POS ID    REF   ALT    QUAL FILTER var_type FORMAT HG00096
      <dbl>  <dbl> <chr> <chr> <chr> <dbl> <chr>  <chr>    <chr>  <chr>  
 1       22 1.61e7 rs58… A     G       100 PASS   SNP      GT     0|0    
 2       22 1.61e7 rs58… G     A       100 PASS   SNP      GT     0|0    
 3       22 1.61e7 rs58… C     T       100 PASS   SNP      GT     0|0    
 4       22 1.61e7 rs58… C     T       100 PASS   SNP      GT     0|0    
 5       22 1.61e7 rs58… C     A       100 PASS   SNP      GT     0|0    
 6       22 1.61e7 rs58… C     A       100 PASS   SNP      GT     0|0    
 7       22 1.61e7 rs58… G     A       100 PASS   SNP      GT     0|0    
 8       22 1.61e7 rs58… G     T       100 PASS   SNP      GT     0|0    
 9       22 1.61e7 rs58… G     T       100 PASS   SNP      GT     0|0    
10       22 1.61e7 esv3… A     <CN0…   100 PASS   SV       GT     3|0    
# … with 989 more rows, and 2,503 more variables: HG00097 <chr>, HG00099 <chr>,
#   HG00100 <chr>, HG00101 <chr>, HG00102 <chr>, HG00103 <chr>, HG00105 <chr>,
#   HG00106 <chr>, HG00107 <chr>, HG00108 <chr>, HG00109 <chr>, HG00110 <chr>,
#   HG00111 <chr>, HG00112 <chr>, HG00113 <chr>, HG00114 <chr>, HG00115 <chr>,
#   HG00116 <chr>, HG00117 <chr>, HG00118 <chr>, HG00119 <chr>, HG00120 <chr>,
#   HG00121 <chr>, HG00122 <chr>, HG00123 <chr>, HG00125 <chr>, HG00126 <chr>,
#   HG00127 <chr>, HG00128 <chr>, HG00129 <chr>, HG00130 <chr>, HG00131 <chr>,
#   HG00132 <chr>, HG00133 <chr>, HG00136 <chr>, HG00137 <chr>, HG00138 <chr>,
#   HG00139 <chr>, HG00140 <chr>, HG00141 <chr>, HG00142 <chr>, HG00143 <chr>,
#   HG00145 <chr>, HG00146 <chr>, HG00148 <chr>, HG00149 <chr>, HG00150 <chr>,
#   HG00151 <chr>, HG00154 <chr>, HG00155 <chr>, HG00157 <chr>, HG00158 <chr>,
#   HG00159 <chr>, HG00160 <chr>, HG00171 <chr>, HG00173 <chr>, HG00174 <chr>,
#   HG00176 <chr>, HG00177 <chr>, HG00178 <chr>, HG00179 <chr>, HG00180 <chr>,
#   HG00181 <chr>, HG00182 <chr>, HG00183 <chr>, HG00185 <chr>, HG00186 <chr>,
#   HG00187 <chr>, HG00188 <chr>, HG00189 <chr>, HG00190 <chr>, HG00231 <chr>,
#   HG00232 <chr>, HG00233 <chr>, HG00234 <chr>, HG00235 <chr>, HG00236 <chr>,
#   HG00237 <chr>, HG00238 <chr>, HG00239 <chr>, HG00240 <chr>, HG00242 <chr>,
#   HG00243 <chr>, HG00244 <chr>, HG00245 <chr>, HG00246 <chr>, HG00250 <chr>,
#   HG00251 <chr>, HG00252 <chr>, HG00253 <chr>, HG00254 <chr>, HG00255 <chr>,
#   HG00256 <chr>, HG00257 <chr>, HG00258 <chr>, HG00259 <chr>, HG00260 <chr>,
#   HG00261 <chr>, HG00262 <chr>, HG00263 <chr>, …

# Count the variants of each type. The capture group stops at the next ";"
# so that INFO fields following `VT=` (such as the MULTI_ALLELIC flag) are
# not glued onto the variant type.
vcf %>%
  extract(INFO, "var_type", "VT=([^;]+)") %>%
  count(var_type)
# A tibble: 3 x 2
  var_type     n
  <chr>    <int>
1 INDEL       31
2 SNP        967
3 SV           1

# Extract the variant type (stopping at the next ";" so later INFO fields are
# not captured), then keep the variant description plus every column whose
# name starts with "HG" or "NA".
vcf %>%
  extract(INFO, "var_type", "VT=([^;]+)") %>%
  select(POS, REF, ALT, var_type, starts_with("HG"), starts_with("NA"))
# A tibble: 999 x 2,508
      POS REF   ALT   var_type HG00096 HG00097 HG00099 HG00100 HG00101 HG00102
    <dbl> <chr> <chr> <chr>    <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
 1 1.61e7 A     G     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 2 1.61e7 G     A     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 3 1.61e7 C     T     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 4 1.61e7 C     T     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 5 1.61e7 C     A     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 6 1.61e7 C     A     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 7 1.61e7 G     A     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 8 1.61e7 G     T     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
 9 1.61e7 G     T     SNP      0|0     0|0     0|0     0|0     0|0     0|0    
10 1.61e7 A     <CN0… SV       3|0     0|0     0|0     0|0     0|0     0|0    
# … with 989 more rows, and 2,498 more variables: HG00103 <chr>, HG00105 <chr>,
#   HG00106 <chr>, HG00107 <chr>, HG00108 <chr>, HG00109 <chr>, HG00110 <chr>,
#   HG00111 <chr>, HG00112 <chr>, HG00113 <chr>, HG00114 <chr>, HG00115 <chr>,
#   HG00116 <chr>, HG00117 <chr>, HG00118 <chr>, HG00119 <chr>, HG00120 <chr>,
#   HG00121 <chr>, HG00122 <chr>, HG00123 <chr>, HG00125 <chr>, HG00126 <chr>,
#   HG00127 <chr>, HG00128 <chr>, HG00129 <chr>, HG00130 <chr>, HG00131 <chr>,
#   HG00132 <chr>, HG00133 <chr>, HG00136 <chr>, HG00137 <chr>, HG00138 <chr>,
#   HG00139 <chr>, HG00140 <chr>, HG00141 <chr>, HG00142 <chr>, HG00143 <chr>,
#   HG00145 <chr>, HG00146 <chr>, HG00148 <chr>, HG00149 <chr>, HG00150 <chr>,
#   HG00151 <chr>, HG00154 <chr>, HG00155 <chr>, HG00157 <chr>, HG00158 <chr>,
#   HG00159 <chr>, HG00160 <chr>, HG00171 <chr>, HG00173 <chr>, HG00174 <chr>,
#   HG00176 <chr>, HG00177 <chr>, HG00178 <chr>, HG00179 <chr>, HG00180 <chr>,
#   HG00181 <chr>, HG00182 <chr>, HG00183 <chr>, HG00185 <chr>, HG00186 <chr>,
#   HG00187 <chr>, HG00188 <chr>, HG00189 <chr>, HG00190 <chr>, HG00231 <chr>,
#   HG00232 <chr>, HG00233 <chr>, HG00234 <chr>, HG00235 <chr>, HG00236 <chr>,
#   HG00237 <chr>, HG00238 <chr>, HG00239 <chr>, HG00240 <chr>, HG00242 <chr>,
#   HG00243 <chr>, HG00244 <chr>, HG00245 <chr>, HG00246 <chr>, HG00250 <chr>,
#   HG00251 <chr>, HG00252 <chr>, HG00253 <chr>, HG00254 <chr>, HG00255 <chr>,
#   HG00256 <chr>, HG00257 <chr>, HG00258 <chr>, HG00259 <chr>, HG00260 <chr>,
#   HG00261 <chr>, HG00262 <chr>, HG00263 <chr>, HG00264 <chr>, HG00265 <chr>,
#   HG00266 <chr>, HG00267 <chr>, HG00268 <chr>, …

# Extract the variant type (stopping at the next ";" so later INFO fields are
# not captured), keep the variant description and the "HG"/"NA" columns, then
# stack those columns into one row per variant and column name.
vcf %>%
  extract(INFO, "var_type", "VT=([^;]+)") %>%
  select(POS, REF, ALT, var_type, starts_with("HG"), starts_with("NA")) %>%
  # Columns 1:4 are POS, REF, ALT and var_type, as ordered by select() above.
  pivot_longer(-(1:4), names_to = "id", values_to = "genotype")
# A tibble: 2,501,496 x 6
        POS REF   ALT   var_type id      genotype
      <dbl> <chr> <chr> <chr>    <chr>   <chr>   
 1 16050075 A     G     SNP      HG00096 0|0     
 2 16050075 A     G     SNP      HG00097 0|0     
 3 16050075 A     G     SNP      HG00099 0|0     
 4 16050075 A     G     SNP      HG00100 0|0     
 5 16050075 A     G     SNP      HG00101 0|0     
 6 16050075 A     G     SNP      HG00102 0|0     
 7 16050075 A     G     SNP      HG00103 0|0     
 8 16050075 A     G     SNP      HG00105 0|0     
 9 16050075 A     G     SNP      HG00106 0|0     
10 16050075 A     G     SNP      HG00107 0|0     
# … with 2,501,486 more rows

# Full tidy pipeline for the VCF table: variant type, long format, and the
# genotype string split into its two sides.
vcf %>%
  # Stop the capture at the next ";" so INFO fields after `VT=` are excluded.
  extract(INFO, "var_type", "VT=([^;]+)") %>%
  select(POS, REF, ALT, var_type, starts_with("HG"), starts_with("NA")) %>%
  # Columns 1:4 are POS, REF, ALT and var_type, as ordered by select() above.
  pivot_longer(-(1:4), names_to = "id", values_to = "genotype") %>%
  # `sep` is a regular expression, hence the escaped "|".
  separate(genotype, c("hap1", "hap2"), sep = "\\|")
# A tibble: 2,501,496 x 7
        POS REF   ALT   var_type id      hap1  hap2 
      <dbl> <chr> <chr> <chr>    <chr>   <chr> <chr>
 1 16050075 A     G     SNP      HG00096 0     0    
 2 16050075 A     G     SNP      HG00097 0     0    
 3 16050075 A     G     SNP      HG00099 0     0    
 4 16050075 A     G     SNP      HG00100 0     0    
 5 16050075 A     G     SNP      HG00101 0     0    
 6 16050075 A     G     SNP      HG00102 0     0    
 7 16050075 A     G     SNP      HG00103 0     0    
 8 16050075 A     G     SNP      HG00105 0     0    
 9 16050075 A     G     SNP      HG00106 0     0    
10 16050075 A     G     SNP      HG00107 0     0    
# … with 2,501,486 more rows

Análise exploratória de dados

Caixeta

# Read the caixeta CSV straight from its URL and sort the rows by `local`,
# `arvore` and `fuste`.
caixeta <- arrange(
  read_csv(
    "http://ecologia.ib.usp.br/bie5782/lib/exe/fetch.php?media=dados:caixeta.csv"
  ),
  local, arvore, fuste
)

caixeta
# A tibble: 1,027 x 7
   local  parcela arvore fuste   cap     h especie               
   <chr>    <dbl>  <dbl> <dbl> <dbl> <dbl> <chr>                 
 1 chauas       1      1     1   210    80 Myrcia sulfiflora     
 2 chauas       1      3     1   170    80 Myrcia sulfiflora     
 3 chauas       1      4     1   720    70 Syagrus romanzoffianus
 4 chauas       1      5     1   200    80 Tabebuia cassinoides  
 5 chauas       1      6     1   750   170 indet.1               
 6 chauas       1      7     1   320    80 Myrcia sulfiflora     
 7 chauas       1      8     1   480   160 Tabebuia cassinoides  
 8 chauas       1      9     1   240   140 Tabebuia cassinoides  
 9 chauas       1     10     1   290   120 Tabebuia cassinoides  
10 chauas       1     10     2   310   120 Tabebuia cassinoides  
# … with 1,017 more rows

# Consistency check: group the rows by `local` and `arvore` and keep only the
# groups that contain more than one distinct `especie` value.
# NOTE(review): the result stays grouped (there is no ungroup()), as the
# "# Groups:" line of the printed tibble shows.
caixeta %>%
  group_by(local, arvore) %>%
  filter(n_distinct(especie) > 1)
# A tibble: 4 x 7
# Groups:   local, arvore [2]
  local  parcela arvore fuste   cap     h especie             
  <chr>    <dbl>  <dbl> <dbl> <dbl> <dbl> <chr>               
1 jureia       4    117     1   370    90 Psidium sp          
2 jureia       4    117     2   510   130 Tabebuia cassinoides
3 retiro       1     16     1   330    80 Mela 1              
4 retiro       1     16     2   135    80 Tabebuia cassinoides

# Consistency check: group the rows by `local` and `arvore` and keep only the
# groups that contain more than one distinct `h` value.
# NOTE(review): the result stays grouped (there is no ungroup()), as the
# "# Groups:" line of the printed tibble shows.
caixeta %>%
  group_by(local, arvore) %>%
  filter(n_distinct(h) > 1)
# A tibble: 17 x 7
# Groups:   local, arvore [3]
   local  parcela arvore fuste   cap     h especie             
   <chr>    <dbl>  <dbl> <dbl> <dbl> <dbl> <chr>               
 1 jureia       4    117     1   370    90 Psidium sp          
 2 jureia       4    117     2   510   130 Tabebuia cassinoides
 3 retiro       1     26     1   185    60 Tabebuia cassinoides
 4 retiro       1     26     2   265    60 Tabebuia cassinoides
 5 retiro       1     26     3    70    60 Tabebuia cassinoides
 6 retiro       1     26     4    70    60 Tabebuia cassinoides
 7 retiro       1     26     5    45    60 Tabebuia cassinoides
 8 retiro       1     26     6   110    60 Tabebuia cassinoides
 9 retiro       1     26     7   275   160 Tabebuia cassinoides
10 retiro       1     28     1   390    90 Tabebuia cassinoides
# … with 7 more rows

Expressão gênica

genexp <- read_tsv("./data/expression_testdata.tsv")
Parsed with column specification:
cols(
  id = col_character(),
  gene = col_character(),
  qpcr = col_double(),
  rnaseq = col_double()
)

genexp 
# A tibble: 288 x 4
   id    gene   qpcr rnaseq
   <chr> <chr> <dbl>  <dbl>
 1 ind1  HLA-A 0.649   562.
 2 ind1  HLA-B 1.05    803.
 3 ind1  HLA-C 0.67    462.
 4 ind2  HLA-A 0.34    526.
 5 ind2  HLA-B 1.31    887.
 6 ind2  HLA-C 2.74    771.
 7 ind3  HLA-A 0.489   351.
 8 ind3  HLA-B 1.23    686.
 9 ind3  HLA-C 3.42    462.
10 ind4  HLA-A 0.88    770.
# … with 278 more rows

# Break the table into a named list with one tibble per value of `gene`.
split(genexp, genexp$gene)
$`HLA-A`
# A tibble: 96 x 4
   id    gene   qpcr rnaseq
   <chr> <chr> <dbl>  <dbl>
 1 ind1  HLA-A 0.649   562.
 2 ind2  HLA-A 0.34    526.
 3 ind3  HLA-A 0.489   351.
 4 ind4  HLA-A 0.88    770.
 5 ind5  HLA-A 0.655   407.
 6 ind6  HLA-A 2.46    827.
 7 ind7  HLA-A 0.433   440.
 8 ind8  HLA-A 0.312   377.
 9 ind9  HLA-A 0.295   351.
10 ind10 HLA-A 0.55    447.
# … with 86 more rows

$`HLA-B`
# A tibble: 96 x 4
   id    gene   qpcr rnaseq
   <chr> <chr> <dbl>  <dbl>
 1 ind1  HLA-B 1.05    803.
 2 ind2  HLA-B 1.31    887.
 3 ind3  HLA-B 1.23    686.
 4 ind4  HLA-B 1.08   1403.
 5 ind5  HLA-B 1.33    837.
 6 ind6  HLA-B 1.00   1243.
 7 ind7  HLA-B 1.08    715.
 8 ind8  HLA-B 1.32    701.
 9 ind9  HLA-B 0.791   624.
10 ind10 HLA-B 1.52    944.
# … with 86 more rows

$`HLA-C`
# A tibble: 96 x 4
   id    gene   qpcr rnaseq
   <chr> <chr> <dbl>  <dbl>
 1 ind1  HLA-C  0.67   462.
 2 ind2  HLA-C  2.74   771.
 3 ind3  HLA-C  3.42   462.
 4 ind4  HLA-C  6.1   1002.
 5 ind5  HLA-C  1.98   538.
 6 ind6  HLA-C  3.20   713.
 7 ind7  HLA-C  1.26   364.
 8 ind8  HLA-C  2.89   384.
 9 ind9  HLA-C  1.81   362.
10 ind10 HLA-C  1.95   504.
# … with 86 more rows

# For each gene, test the correlation between the qPCR and RNA-seq values and
# turn the test result into a one-row tibble (one list element per gene).
split(genexp, genexp$gene) %>%
  map(~ broom::tidy(cor.test(.x$qpcr, .x$rnaseq)))
$`HLA-A`
# A tibble: 1 x 8
  estimate statistic  p.value parameter conf.low conf.high method    alternative
     <dbl>     <dbl>    <dbl>     <int>    <dbl>     <dbl> <chr>     <chr>      
1    0.558      6.53  3.41e-9        94    0.403     0.682 Pearson'… two.sided  

$`HLA-B`
# A tibble: 1 x 8
  estimate statistic p.value parameter conf.low conf.high method     alternative
     <dbl>     <dbl>   <dbl>     <int>    <dbl>     <dbl> <chr>      <chr>      
1    0.172      1.69  0.0935        94  -0.0294     0.360 Pearson's… two.sided  

$`HLA-C`
# A tibble: 1 x 8
  estimate statistic  p.value parameter conf.low conf.high method    alternative
     <dbl>     <dbl>    <dbl>     <int>    <dbl>     <dbl> <chr>     <chr>      
1    0.515      5.83  7.72e-8        94    0.351     0.649 Pearson'… two.sided  

# Same per-gene correlation test, but row-bound into a single tibble; `.id`
# stores the list names (the gene) in a `gene` column.
split(genexp, genexp$gene) %>%
  map_df(~ broom::tidy(cor.test(.x$qpcr, .x$rnaseq)), .id = "gene")
# A tibble: 3 x 9
  gene  estimate statistic p.value parameter conf.low conf.high method
  <chr>    <dbl>     <dbl>   <dbl>     <int>    <dbl>     <dbl> <chr> 
1 HLA-A    0.558      6.53 3.41e-9        94   0.403      0.682 Pears…
2 HLA-B    0.172      1.69 9.35e-2        94  -0.0294     0.360 Pears…
3 HLA-C    0.515      5.83 7.72e-8        94   0.351      0.649 Pears…
# … with 1 more variable: alternative <chr>

Visualização

Gapminder data

# Load the Gapminder inputs: three CSV tables and one Excel workbook.
gapminder_lifeExp <- read_csv("./data/life_expectancy_years.csv")
gapminder_income <- read_csv(
  "./data/income_per_person_gdppercapita_ppp_inflation_adjusted.csv"
)
gapminder_population <- read_csv("./data/population_total.csv")

# The geography table is read from the second sheet of the workbook.
gapminder_regions <- readxl::read_excel(
  "./data/Data Geographies - v1 - by Gapminder.xlsx",
  sheet = 2
)

gapminder_lifeExp
# A tibble: 187 x 220
   country `1800` `1801` `1802` `1803` `1804` `1805` `1806` `1807` `1808` `1809`
   <chr>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 Afghan…   28.2   28.2   28.2   28.2   28.2   28.2   28.1   28.1   28.1   28.1
 2 Albania   35.4   35.4   35.4   35.4   35.4   35.4   35.4   35.4   35.4   35.4
 3 Algeria   28.8   28.8   28.8   28.8   28.8   28.8   28.8   28.8   28.8   28.8
 4 Andorra   NA     NA     NA     NA     NA     NA     NA     NA     NA     NA  
 5 Angola    27     27     27     27     27     27     27     27     27     27  
 6 Antigu…   33.5   33.5   33.5   33.5   33.5   33.5   33.5   33.5   33.5   33.5
 7 Argent…   33.2   33.2   33.2   33.2   33.2   33.2   33.2   33.2   33.2   33.2
 8 Armenia   34     34     34     34     34     34     34     34     34     34  
 9 Austra…   34     34     34     34     34     34     34     34     34     34  
10 Austria   34.4   34.4   34.4   34.4   34.4   34.4   34.4   34.4   34.4   34.4
# … with 177 more rows, and 209 more variables: `1810` <dbl>, `1811` <dbl>,
#   `1812` <dbl>, `1813` <dbl>, `1814` <dbl>, `1815` <dbl>, `1816` <dbl>,
#   `1817` <dbl>, `1818` <dbl>, `1819` <dbl>, `1820` <dbl>, `1821` <dbl>,
#   `1822` <dbl>, `1823` <dbl>, `1824` <dbl>, `1825` <dbl>, `1826` <dbl>,
#   `1827` <dbl>, `1828` <dbl>, `1829` <dbl>, `1830` <dbl>, `1831` <dbl>,
#   `1832` <dbl>, `1833` <dbl>, `1834` <dbl>, `1835` <dbl>, `1836` <dbl>,
#   `1837` <dbl>, `1838` <dbl>, `1839` <dbl>, `1840` <dbl>, `1841` <dbl>,
#   `1842` <dbl>, `1843` <dbl>, `1844` <dbl>, `1845` <dbl>, `1846` <dbl>,
#   `1847` <dbl>, `1848` <dbl>, `1849` <dbl>, `1850` <dbl>, `1851` <dbl>,
#   `1852` <dbl>, `1853` <dbl>, `1854` <dbl>, `1855` <dbl>, `1856` <dbl>,
#   `1857` <dbl>, `1858` <dbl>, `1859` <dbl>, `1860` <dbl>, `1861` <dbl>,
#   `1862` <dbl>, `1863` <dbl>, `1864` <dbl>, `1865` <dbl>, `1866` <dbl>,
#   `1867` <dbl>, `1868` <dbl>, `1869` <dbl>, `1870` <dbl>, `1871` <dbl>,
#   `1872` <dbl>, `1873` <dbl>, `1874` <dbl>, `1875` <dbl>, `1876` <dbl>,
#   `1877` <dbl>, `1878` <dbl>, `1879` <dbl>, `1880` <dbl>, `1881` <dbl>,
#   `1882` <dbl>, `1883` <dbl>, `1884` <dbl>, `1885` <dbl>, `1886` <dbl>,
#   `1887` <dbl>, `1888` <dbl>, `1889` <dbl>, `1890` <dbl>, `1891` <dbl>,
#   `1892` <dbl>, `1893` <dbl>, `1894` <dbl>, `1895` <dbl>, `1896` <dbl>,
#   `1897` <dbl>, `1898` <dbl>, `1899` <dbl>, `1900` <dbl>, `1901` <dbl>,
#   `1902` <dbl>, `1903` <dbl>, `1904` <dbl>, `1905` <dbl>, `1906` <dbl>,
#   `1907` <dbl>, `1908` <dbl>, `1909` <dbl>, …

# One row per country and year; the former column names (years) land in
# `year` as character strings.
tidy_lifeExp <- pivot_longer(
  gapminder_lifeExp,
  cols = -country,
  names_to = "year",
  values_to = "life_exp"
)
tidy_lifeExp
# A tibble: 40,953 x 3
   country     year  life_exp
   <chr>       <chr>    <dbl>
 1 Afghanistan 1800      28.2
 2 Afghanistan 1801      28.2
 3 Afghanistan 1802      28.2
 4 Afghanistan 1803      28.2
 5 Afghanistan 1804      28.2
 6 Afghanistan 1805      28.2
 7 Afghanistan 1806      28.1
 8 Afghanistan 1807      28.1
 9 Afghanistan 1808      28.1
10 Afghanistan 1809      28.1
# … with 40,943 more rows

# One row per country and year; the former column names (years) land in
# `year` as character strings.
tidy_income <- pivot_longer(
  gapminder_income,
  cols = -country,
  names_to = "year",
  values_to = "income"
)
tidy_income
# A tibble: 46,513 x 3
   country     year  income
   <chr>       <chr>  <dbl>
 1 Afghanistan 1800     603
 2 Afghanistan 1801     603
 3 Afghanistan 1802     603
 4 Afghanistan 1803     603
 5 Afghanistan 1804     603
 6 Afghanistan 1805     603
 7 Afghanistan 1806     603
 8 Afghanistan 1807     603
 9 Afghanistan 1808     603
10 Afghanistan 1809     603
# … with 46,503 more rows

# Reshape population from wide (one column per year) to long format:
# every column except `country` becomes a (year, pop) pair.
tidy_pop <- pivot_longer(
    gapminder_population,
    cols = -country,
    names_to = "year",
    values_to = "pop"
)
tidy_pop
# A tibble: 58,695 x 3
   country     year      pop
   <chr>       <chr>   <dbl>
 1 Afghanistan 1800  3280000
 2 Afghanistan 1801  3280000
 3 Afghanistan 1802  3280000
 4 Afghanistan 1803  3280000
 5 Afghanistan 1804  3280000
 6 Afghanistan 1805  3280000
 7 Afghanistan 1806  3280000
 8 Afghanistan 1807  3280000
 9 Afghanistan 1808  3280000
10 Afghanistan 1809  3280000
# … with 58,685 more rows

# Print the raw regions table to inspect which columns are available.
gapminder_regions
# A tibble: 197 x 12
   geo   name  four_regions eight_regions six_regions members_oecd_g77 Latitude
   <chr> <chr> <chr>        <chr>         <chr>       <chr>               <dbl>
 1 afg   Afgh… asia         asia_west     south_asia  g77                  33  
 2 alb   Alba… europe       europe_east   europe_cen… others               41  
 3 dza   Alge… africa       africa_north  middle_eas… g77                  28  
 4 and   Ando… europe       europe_west   europe_cen… others               42.5
 5 ago   Ango… africa       africa_sub_s… sub_sahara… g77                 -12.5
 6 atg   Anti… americas     america_north america     g77                  17.0
 7 arg   Arge… americas     america_south america     g77                 -34  
 8 arm   Arme… europe       europe_east   europe_cen… others               40.2
 9 aus   Aust… asia         east_asia_pa… east_asia_… oecd                -25  
10 aut   Aust… europe       europe_west   europe_cen… oecd                 47.3
# … with 187 more rows, and 5 more variables: Longitude <dbl>, `UN member
#   since` <dttm>, `World bank region` <chr>, `World bank, 4 income groups
#   2017` <chr>, `World bank, 3 income groups 2017` <lgl>

# Keep only the country name and its four-region classification,
# renaming them to `country` and `region` on the way.
tidy_regions <- select(
    gapminder_regions,
    country = name,
    region = four_regions
)

Join

# Print the long-format life expectancy table (first input of the join).
tidy_lifeExp
# A tibble: 40,953 x 3
   country     year  life_exp
   <chr>       <chr>    <dbl>
 1 Afghanistan 1800      28.2
 2 Afghanistan 1801      28.2
 3 Afghanistan 1802      28.2
 4 Afghanistan 1803      28.2
 5 Afghanistan 1804      28.2
 6 Afghanistan 1805      28.2
 7 Afghanistan 1806      28.1
 8 Afghanistan 1807      28.1
 9 Afghanistan 1808      28.1
10 Afghanistan 1809      28.1
# … with 40,943 more rows

# Print the long-format income table (second input of the join).
tidy_income
# A tibble: 46,513 x 3
   country     year  income
   <chr>       <chr>  <dbl>
 1 Afghanistan 1800     603
 2 Afghanistan 1801     603
 3 Afghanistan 1802     603
 4 Afghanistan 1803     603
 5 Afghanistan 1804     603
 6 Afghanistan 1805     603
 7 Afghanistan 1806     603
 8 Afghanistan 1807     603
 9 Afghanistan 1808     603
10 Afghanistan 1809     603
# … with 46,503 more rows

# Keep only the country/year pairs present in both tables.
tidy_lifeExp %>%
    inner_join(tidy_income, by = c("country", "year"))
# A tibble: 40,953 x 4
   country     year  life_exp income
   <chr>       <chr>    <dbl>  <dbl>
 1 Afghanistan 1800      28.2    603
 2 Afghanistan 1801      28.2    603
 3 Afghanistan 1802      28.2    603
 4 Afghanistan 1803      28.2    603
 5 Afghanistan 1804      28.2    603
 6 Afghanistan 1805      28.2    603
 7 Afghanistan 1806      28.1    603
 8 Afghanistan 1807      28.1    603
 9 Afghanistan 1808      28.1    603
10 Afghanistan 1809      28.1    603
# … with 40,943 more rows

# Chain a second inner join to add population for each country/year pair.
tidy_lifeExp %>%
    inner_join(tidy_income, by = c("country", "year")) %>%
    inner_join(tidy_pop, by = c("country", "year"))
# A tibble: 40,953 x 5
   country     year  life_exp income     pop
   <chr>       <chr>    <dbl>  <dbl>   <dbl>
 1 Afghanistan 1800      28.2    603 3280000
 2 Afghanistan 1801      28.2    603 3280000
 3 Afghanistan 1802      28.2    603 3280000
 4 Afghanistan 1803      28.2    603 3280000
 5 Afghanistan 1804      28.2    603 3280000
 6 Afghanistan 1805      28.2    603 3280000
 7 Afghanistan 1806      28.1    603 3280000
 8 Afghanistan 1807      28.1    603 3280000
 9 Afghanistan 1808      28.1    603 3280000
10 Afghanistan 1809      28.1    603 3280000
# … with 40,943 more rows

# Attach each country's region; a left join keeps every country/year row
# even when no matching region exists.
tidy_lifeExp %>%
    inner_join(tidy_income, by = c("country", "year")) %>%
    inner_join(tidy_pop, by = c("country", "year")) %>%
    left_join(tidy_regions, by = "country")
# A tibble: 40,953 x 6
   country     year  life_exp income     pop region
   <chr>       <chr>    <dbl>  <dbl>   <dbl> <chr> 
 1 Afghanistan 1800      28.2    603 3280000 asia  
 2 Afghanistan 1801      28.2    603 3280000 asia  
 3 Afghanistan 1802      28.2    603 3280000 asia  
 4 Afghanistan 1803      28.2    603 3280000 asia  
 5 Afghanistan 1804      28.2    603 3280000 asia  
 6 Afghanistan 1805      28.2    603 3280000 asia  
 7 Afghanistan 1806      28.1    603 3280000 asia  
 8 Afghanistan 1807      28.1    603 3280000 asia  
 9 Afghanistan 1808      28.1    603 3280000 asia  
10 Afghanistan 1809      28.1    603 3280000 asia  
# … with 40,943 more rows

# Assemble the final table: join life expectancy, income and population on
# country/year, attach each country's region, then reorder the columns.
gapminder <- tidy_lifeExp %>%
    inner_join(tidy_income, by = c("country", "year")) %>%
    inner_join(tidy_pop, by = c("country", "year")) %>%
    left_join(tidy_regions, by = "country") %>%
    select(country, region, year, pop, life_exp, income)
gapminder
# A tibble: 40,953 x 6
   country     region year      pop life_exp income
   <chr>       <chr>  <chr>   <dbl>    <dbl>  <dbl>
 1 Afghanistan asia   1800  3280000     28.2    603
 2 Afghanistan asia   1801  3280000     28.2    603
 3 Afghanistan asia   1802  3280000     28.2    603
 4 Afghanistan asia   1803  3280000     28.2    603
 5 Afghanistan asia   1804  3280000     28.2    603
 6 Afghanistan asia   1805  3280000     28.2    603
 7 Afghanistan asia   1806  3280000     28.1    603
 8 Afghanistan asia   1807  3280000     28.1    603
 9 Afghanistan asia   1808  3280000     28.1    603
10 Afghanistan asia   1809  3280000     28.1    603
# … with 40,943 more rows

# Keep only the 2018 observations. `year` is a character column (it was built
# from the pivoted column names), so compare against the string "2018" rather
# than relying on implicit numeric-to-character coercion.
gapminder18 <- gapminder %>%
    filter(year == "2018")

ggplot2

# Data and aesthetic mappings only: with no geom layer this draws empty axes.
ggplot(
    data = gapminder18,
    mapping = aes(x = income, y = life_exp)
)

# Same mappings plus a point layer: one point per row of gapminder18.
ggplot(
    data = gapminder18,
    mapping = aes(x = income, y = life_exp)
) +
    geom_point()

# Bubble chart of income vs. life expectancy: one point per row, filled by
# region and sized by population.
gapminder18 %>%
    # Sort by descending population so the largest bubbles are drawn first
    # and smaller ones end up on top of them.
    arrange(desc(pop)) %>%
    ggplot(aes(income, life_exp)) +
    # shape = 21 is a circle with separate fill and outline colours
    geom_point(aes(fill = region, size = pop), shape = 21) +
    # Log-scaled income axis with breaks doubling from 500 up to 128000
    scale_x_log10(breaks = 2^(-1:7) * 1000) +
    scale_y_continuous(breaks = seq(0, 80, 10), limits = c(0, 90)) +
    # guide = "none" hides the size legend; guide = FALSE is deprecated
    # since ggplot2 3.3.4 and emits a warning.
    scale_size(range = c(1, 20), guide = "none") +
    ggsci::scale_fill_aaas() +
    labs(x = "Income", y = "Life expectancy")

OK, mas ainda não sei como começar

Dicas finais: